From: Alex Crichton Date: Sat, 9 Sep 2017 01:30:37 +0000 (-0700) Subject: Periodically gc repos in Cargo X-Git-Tag: archive/raspbian/0.35.0-2+rpi1~3^2^2^2^2^2^2^2~22^2~6^2~53^2 X-Git-Url: https://dgit.raspbian.org/%22http://www.example.com/cgi/success//%22http:/www.example.com/cgi/success/?a=commitdiff_plain;h=fe5a5c783fd7a39a0b68ea195ff4a62a12ea750c;p=cargo.git Periodically gc repos in Cargo This commit is targeted at improving the long-term management of git checkouts and git repositories. Currently every time data is fetched from crates.io libgit2 will create a new pack file in the repository. These pack files accumulate over time and end up causing pathological behavior if there are lots of them, causing libgit2 to open many file descriptors all at once, possibly blowing the system's file descriptor limits. To alleviate this problem you typically run `git gc`, but libgit2 doesn't have this implemented. Instead what Cargo now does is detect this situation and literally run the command line tool `git gc` in a best-effort attempt to compact the repo. Failing that, for example when git isn't installed, Cargo will remove the entire repo and do a full checkout again. At the same time this commit also generalizes this logic, plus the existing fast path github logic, to all git repositories and not just the index. That way all git repositories can benefit from the "github fast path" as well as the compaction steps. 
Closes #4403 --- diff --git a/src/cargo/sources/git/utils.rs b/src/cargo/sources/git/utils.rs index e715d63d1..030bf764b 100644 --- a/src/cargo/sources/git/utils.rs +++ b/src/cargo/sources/git/utils.rs @@ -1,8 +1,11 @@ use std::env; use std::fmt; use std::fs::{self, File}; +use std::mem; use std::path::{Path, PathBuf}; +use std::process::Command; +use curl::easy::{Easy, List}; use git2::{self, ObjectType}; use serde::ser::{self, Serialize}; use url::Url; @@ -90,8 +93,8 @@ impl GitRemote { pub fn checkout(&self, into: &Path, cargo_config: &Config) -> CargoResult { let repo = match git2::Repository::open(into) { - Ok(repo) => { - self.fetch_into(&repo, cargo_config).chain_err(|| { + Ok(mut repo) => { + self.fetch_into(&mut repo, cargo_config).chain_err(|| { format!("failed to fetch into {}", into.display()) })?; repo @@ -119,21 +122,19 @@ impl GitRemote { }) } - fn fetch_into(&self, dst: &git2::Repository, cargo_config: &Config) -> CargoResult<()> { + fn fetch_into(&self, dst: &mut git2::Repository, cargo_config: &Config) -> CargoResult<()> { // Create a local anonymous remote in the repository to fetch the url - let url = self.url.to_string(); let refspec = "refs/heads/*:refs/heads/*"; - fetch(dst, &url, refspec, cargo_config) + fetch(dst, &self.url, refspec, cargo_config) } fn clone_into(&self, dst: &Path, cargo_config: &Config) -> CargoResult { - let url = self.url.to_string(); if fs::metadata(&dst).is_ok() { fs::remove_dir_all(dst)?; } fs::create_dir_all(dst)?; - let repo = git2::Repository::init_bare(dst)?; - fetch(&repo, &url, "refs/heads/*:refs/heads/*", cargo_config)?; + let mut repo = git2::Repository::init_bare(dst)?; + fetch(&mut repo, &self.url, "refs/heads/*:refs/heads/*", cargo_config)?; Ok(repo) } } @@ -147,7 +148,7 @@ impl GitDatabase { -> CargoResult { let checkout = match git2::Repository::open(dest) { Ok(repo) => { - let checkout = GitCheckout::new(dest, self, rev, repo); + let mut checkout = GitCheckout::new(dest, self, rev, repo); if 
!checkout.is_fresh() { checkout.fetch(cargo_config)?; checkout.reset()?; @@ -244,7 +245,7 @@ impl<'a> GitCheckout<'a> { let url = source.to_url()?; let url = url.to_string(); - let repo = git2::Repository::clone(&url, into) + let repo = git2::Repository::clone(&url, into) .chain_err(|| { internal(format!("failed to clone {} into {}", source.display(), into.display())) @@ -262,12 +263,11 @@ impl<'a> GitCheckout<'a> { } } - fn fetch(&self, cargo_config: &Config) -> CargoResult<()> { + fn fetch(&mut self, cargo_config: &Config) -> CargoResult<()> { info!("fetch {}", self.repo.path().display()); let url = self.database.path.to_url()?; - let url = url.to_string(); let refspec = "refs/heads/*:refs/heads/*"; - fetch(&self.repo, &url, refspec, cargo_config)?; + fetch(&mut self.repo, &url, refspec, cargo_config)?; Ok(()) } @@ -329,7 +329,7 @@ impl<'a> GitCheckout<'a> { let target = repo.head()?.target(); Ok((target, repo)) }); - let repo = match head_and_repo { + let mut repo = match head_and_repo { Ok((head, repo)) => { if child.head_id() == head { return Ok(()) @@ -345,7 +345,8 @@ impl<'a> GitCheckout<'a> { // Fetch data from origin and reset to the head commit let refspec = "refs/heads/*:refs/heads/*"; - fetch(&repo, url, refspec, cargo_config).chain_err(|| { + let url = url.to_url()?; + fetch(&mut repo, &url, refspec, cargo_config).chain_err(|| { internal(format!("failed to fetch submodule `{}` from {}", child.name().unwrap_or(""), url)) })?; @@ -557,8 +558,8 @@ fn with_authentication(url: &str, cfg: &git2::Config, mut f: F) }) } -pub fn fetch(repo: &git2::Repository, - url: &str, +pub fn fetch(repo: &mut git2::Repository, + url: &Url, refspec: &str, config: &Config) -> CargoResult<()> { if !config.network_allowed() { @@ -566,20 +567,167 @@ pub fn fetch(repo: &git2::Repository, was specified") } - with_authentication(url, &repo.config()?, |f| { + // If we're fetching from github, attempt github's special fast path for + // testing if we've already got an up-to-date copy 
of the repository + if url.host_str() == Some("github.com") { + if let Ok(oid) = repo.refname_to_id("refs/remotes/origin/master") { + let mut handle = config.http()?.borrow_mut(); + debug!("attempting github fast path for {}", url); + if github_up_to_date(&mut handle, url, &oid) { + return Ok(()) + } else { + debug!("fast path failed, falling back to a git fetch"); + } + } + } + + // We reuse repositories quite a lot, so before we go through and update the + // repo check to see if it's a little too old and could benefit from a gc. + // In theory this shouldn't be too too expensive compared to the network + // request we're about to issue. + maybe_gc_repo(repo)?; + + debug!("doing a fetch for {}", url); + with_authentication(url.as_str(), &repo.config()?, |f| { let mut cb = git2::RemoteCallbacks::new(); cb.credentials(f); // Create a local anonymous remote in the repository to fetch the url - let mut remote = repo.remote_anonymous(url)?; + let mut remote = repo.remote_anonymous(url.as_str())?; let mut opts = git2::FetchOptions::new(); opts.remote_callbacks(cb) .download_tags(git2::AutotagOption::All); network::with_retry(config, || { + debug!("initiating fetch of {} from {}", refspec, url); remote.fetch(&[refspec], Some(&mut opts), None) .map_err(CargoError::from) })?; Ok(()) }) } + +/// Cargo has a bunch of long-lived git repositories in its global cache and +/// some, like the index, are updated very frequently. Right now each update +/// creates a new "pack file" inside the git database, and over time this can +/// cause bad performance and bad current behavior in libgit2. +/// +/// One pathological use case today is where libgit2 opens hundreds of file +/// descriptors, getting us dangerously close to blowing out the OS limits of +/// how many fds we can have open. This is detailed in #4403. +/// +/// To try to combat this problem we attempt a `git gc` here. Note, though, that +/// we may not even have `git` installed on the system! 
As a result we +/// opportunistically try a `git gc` when the pack directory looks too big, and +/// failing that we just blow away the repository and start over. +fn maybe_gc_repo(repo: &mut git2::Repository) -> CargoResult<()> { + // Here we arbitrarily declare that if you have more than 100 files in your + // `pack` folder that we need to do a gc. + let entries = match repo.path().join("objects/pack").read_dir() { + Ok(e) => e.count(), + Err(_) => { + debug!("skipping gc as pack dir appears gone"); + return Ok(()) + } + }; + let max = env::var("__CARGO_PACKFILE_LIMIT").ok() + .and_then(|s| s.parse::().ok()) + .unwrap_or(100); + if entries < max { + debug!("skipping gc as there's only {} pack files", entries); + return Ok(()) + } + + // First up, try a literal `git gc` by shelling out to git. This is pretty + // likely to fail though as we may not have `git` installed. Note that + // libgit2 doesn't currently implement the gc operation, so there's no + // equivalent there. + match Command::new("git").arg("gc").current_dir(repo.path()).output() { + Ok(out) => { + debug!("git-gc status: {}\n\nstdout ---\n{}\nstderr ---\n{}", + out.status, + String::from_utf8_lossy(&out.stdout), + String::from_utf8_lossy(&out.stderr)); + if out.status.success() { + let new = git2::Repository::open(repo.path())?; + mem::replace(repo, new); + return Ok(()) + } + } + Err(e) => debug!("git-gc failed to spawn: {}", e), + } + + // Alright all else failed, let's start over. + // + // Here we want to drop the current repository object pointed to by `repo`, + // so we initialize temporary repository in a sub-folder, blow away the + // existing git folder, and then recreate the git repo. Finally we blow away + // the `tmp` folder we allocated. + let path = repo.path().to_path_buf(); + let tmp = path.join("tmp"); + mem::replace(repo, git2::Repository::init(&tmp)?); + for entry in path.read_dir()? 
{ + let entry = entry?; + if entry.file_name().to_str() == Some("tmp") { + continue + } + let path = entry.path(); + drop(fs::remove_file(&path).or_else(|_| fs::remove_dir_all(&path))); + } + if repo.is_bare() { + mem::replace(repo, git2::Repository::init_bare(path)?); + } else { + mem::replace(repo, git2::Repository::init(path)?); + } + fs::remove_dir_all(&tmp).chain_err(|| { + format!("failed to remove {:?}", tmp) + })?; + Ok(()) +} + +/// Updating the index is done pretty regularly so we want it to be as fast as +/// possible. For registries hosted on github (like the crates.io index) there's +/// a fast path available to use [1] to tell us that there's no updates to be +/// made. +/// +/// This function will attempt to hit that fast path and verify that the `oid` +/// is actually the current `master` branch of the repository. If `true` is +/// returned then no update needs to be performed, but if `false` is returned +/// then the standard update logic still needs to happen. +/// +/// [1]: https://developer.github.com/v3/repos/commits/#get-the-sha-1-of-a-commit-reference +/// +/// Note that this function should never cause an actual failure because it's +/// just a fast path. As a result all errors are ignored in this function and we +/// just return a `bool`. Any real errors will be reported through the normal +/// update path above. +fn github_up_to_date(handle: &mut Easy, url: &Url, oid: &git2::Oid) -> bool { + macro_rules! 
try { + ($e:expr) => (match $e { + Some(e) => e, + None => return false, + }) + } + + // This expects github urls in the form `github.com/user/repo` and nothing + // else + let mut pieces = try!(url.path_segments()); + let username = try!(pieces.next()); + let repo = try!(pieces.next()); + if pieces.next().is_some() { + return false + } + + let url = format!("https://api.github.com/repos/{}/{}/commits/master", + username, repo); + try!(handle.get(true).ok()); + try!(handle.url(&url).ok()); + try!(handle.useragent("cargo").ok()); + let mut headers = List::new(); + try!(headers.append("Accept: application/vnd.github.3.sha").ok()); + try!(headers.append(&format!("If-None-Match: \"{}\"", oid)).ok()); + try!(handle.http_headers(headers).ok()); + try!(handle.perform().ok()); + + try!(handle.response_code().ok()) == 304 +} diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs index 2f9f7c87e..27870d67c 100644 --- a/src/cargo/sources/registry/remote.rs +++ b/src/cargo/sources/registry/remote.rs @@ -4,11 +4,9 @@ use std::io::prelude::*; use std::mem; use std::path::Path; -use curl::easy::{Easy, List}; use git2; use hex::ToHex; use serde_json; -use url::Url; use core::{PackageId, SourceId}; use ops; @@ -24,7 +22,6 @@ pub struct RemoteRegistry<'cfg> { cache_path: Filesystem, source_id: SourceId, config: &'cfg Config, - handle: LazyCell>, tree: RefCell>>, repo: LazyCell, head: Cell>, @@ -39,18 +36,11 @@ impl<'cfg> RemoteRegistry<'cfg> { source_id: source_id.clone(), config: config, tree: RefCell::new(None), - handle: LazyCell::new(), repo: LazyCell::new(), head: Cell::new(None), } } - fn easy(&self) -> CargoResult<&RefCell> { - self.handle.get_or_try_init(|| { - ops::http_handle(self.config).map(RefCell::new) - }) - } - fn repo(&self) -> CargoResult<&git2::Repository> { self.repo.get_or_try_init(|| { let path = self.index_path.clone().into_path_unlocked(); @@ -170,38 +160,22 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { // hit the 
index, which may not actually read this configuration. ops::http_handle(self.config)?; - let repo = self.repo()?; + self.repo()?; + self.head.set(None); + *self.tree.borrow_mut() = None; let _lock = self.index_path.open_rw(Path::new(INDEX_LOCK), self.config, "the registry index")?; self.config.shell().status("Updating", format!("registry `{}`", self.source_id.url()))?; - let mut needs_fetch = true; - - if self.source_id.url().host_str() == Some("github.com") { - if let Ok(oid) = self.head() { - let mut handle = self.easy()?.borrow_mut(); - debug!("attempting github fast path for {}", - self.source_id.url()); - if github_up_to_date(&mut handle, self.source_id.url(), &oid) { - needs_fetch = false; - } else { - debug!("fast path failed, falling back to a git fetch"); - } - } - } - - if needs_fetch { - // git fetch origin master - let url = self.source_id.url().to_string(); - let refspec = "refs/heads/master:refs/remotes/origin/master"; - git::fetch(&repo, &url, refspec, self.config).chain_err(|| { - format!("failed to fetch `{}`", url) - })?; - } - self.head.set(None); - *self.tree.borrow_mut() = None; + // git fetch origin master + let url = self.source_id.url(); + let refspec = "refs/heads/master:refs/remotes/origin/master"; + let repo = self.repo.borrow_mut().unwrap(); + git::fetch(repo, url, refspec, self.config).chain_err(|| { + format!("failed to fetch `{}`", url) + })?; Ok(()) } @@ -240,7 +214,7 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { // download we should resume either from the start or the middle // on the next time let url = url.to_string(); - let mut handle = self.easy()?.borrow_mut(); + let mut handle = self.config.http()?.borrow_mut(); handle.get(true)?; handle.url(&url)?; handle.follow_location(true)?; @@ -284,50 +258,3 @@ impl<'cfg> Drop for RemoteRegistry<'cfg> { self.tree.borrow_mut().take(); } } - -/// Updating the index is done pretty regularly so we want it to be as fast as -/// possible. 
For registries hosted on github (like the crates.io index) there's -/// a fast path available to use [1] to tell us that there's no updates to be -/// made. -/// -/// This function will attempt to hit that fast path and verify that the `oid` -/// is actually the current `master` branch of the repository. If `true` is -/// returned then no update needs to be performed, but if `false` is returned -/// then the standard update logic still needs to happen. -/// -/// [1]: https://developer.github.com/v3/repos/commits/#get-the-sha-1-of-a-commit-reference -/// -/// Note that this function should never cause an actual failure because it's -/// just a fast path. As a result all errors are ignored in this function and we -/// just return a `bool`. Any real errors will be reported through the normal -/// update path above. -fn github_up_to_date(handle: &mut Easy, url: &Url, oid: &git2::Oid) -> bool { - macro_rules! try { - ($e:expr) => (match $e { - Some(e) => e, - None => return false, - }) - } - - // This expects github urls in the form `github.com/user/repo` and nothing - // else - let mut pieces = try!(url.path_segments()); - let username = try!(pieces.next()); - let repo = try!(pieces.next()); - if pieces.next().is_some() { - return false - } - - let url = format!("https://api.github.com/repos/{}/{}/commits/master", - username, repo); - try!(handle.get(true).ok()); - try!(handle.url(&url).ok()); - try!(handle.useragent("cargo").ok()); - let mut headers = List::new(); - try!(headers.append("Accept: application/vnd.github.3.sha").ok()); - try!(headers.append(&format!("If-None-Match: \"{}\"", oid)).ok()); - try!(handle.http_headers(headers).ok()); - try!(handle.perform().ok()); - - try!(handle.response_code().ok()) == 304 -} diff --git a/src/cargo/util/config.rs b/src/cargo/util/config.rs index 359ec01ab..fe15dcb6b 100644 --- a/src/cargo/util/config.rs +++ b/src/cargo/util/config.rs @@ -12,17 +12,19 @@ use std::path::{Path, PathBuf}; use std::str::FromStr; use 
std::sync::{Once, ONCE_INIT}; -use core::{Shell, CliUnstable}; -use core::shell::Verbosity; +use curl::easy::Easy; use jobserver; use serde::{Serialize, Serializer}; use toml; + +use core::shell::Verbosity; +use core::{Shell, CliUnstable}; +use ops; use util::Rustc; use util::errors::{CargoResult, CargoResultExt, CargoError, internal}; use util::paths; -use util::{Filesystem, LazyCell}; - use util::toml as cargo_toml; +use util::{Filesystem, LazyCell}; use self::ConfigValue as CV; @@ -40,6 +42,7 @@ pub struct Config { locked: Cell, jobserver: Option, cli_flags: RefCell, + easy: LazyCell>, } impl Config { @@ -76,6 +79,7 @@ impl Config { } }, cli_flags: RefCell::new(CliUnstable::default()), + easy: LazyCell::new(), } } @@ -537,6 +541,12 @@ impl Config { pub fn jobserver_from_env(&self) -> Option<&jobserver::Client> { self.jobserver.as_ref() } + + pub fn http(&self) -> CargoResult<&RefCell> { + self.easy.get_or_try_init(|| { + ops::http_handle(self).map(RefCell::new) + }) + } } #[derive(Eq, PartialEq, Clone, Copy)] diff --git a/src/cargo/util/lazy_cell.rs b/src/cargo/util/lazy_cell.rs index 8239559d8..6a6383907 100644 --- a/src/cargo/util/lazy_cell.rs +++ b/src/cargo/util/lazy_cell.rs @@ -6,7 +6,7 @@ //! outer object. //! //! The limitation of a `LazyCell` is that after initialized, it can never be -//! modified. +//! modified unless you've otherwise got a `&mut` reference use std::cell::UnsafeCell; @@ -47,6 +47,13 @@ impl LazyCell { } } + /// Same as `borrow`, but the mutable version + pub fn borrow_mut(&mut self) -> Option<&mut T> { + unsafe { + (*self.inner.get()).as_mut() + } + } + /// Consumes this `LazyCell`, returning the underlying value. 
pub fn into_inner(self) -> Option { unsafe { diff --git a/tests/small-fd-limits.rs b/tests/small-fd-limits.rs new file mode 100644 index 000000000..e8249cafc --- /dev/null +++ b/tests/small-fd-limits.rs @@ -0,0 +1,108 @@ +extern crate cargotest; +extern crate git2; +extern crate hamcrest; +extern crate url; + +use std::env; +use std::ffi::OsStr; +use std::path::PathBuf; +use std::process::Command; + +use cargotest::support::{execs, project}; +use cargotest::support::registry::Package; +use cargotest::support::paths; +use cargotest::support::git; +use hamcrest::assert_that; + +use url::Url; + +fn find_index() -> PathBuf { + let dir = paths::home().join(".cargo/registry/index"); + dir.read_dir().unwrap().next().unwrap().unwrap().path() +} + +fn run_test(path_env: Option<&OsStr>) { + const N: usize = 50; + + let foo = project("foo") + .file("Cargo.toml", r#" + [package] + name = "foo" + version = "0.0.1" + authors = [] + + [dependencies] + bar = "*" + "#) + .file("src/lib.rs", ""); + Package::new("bar", "0.1.0").publish(); + + assert_that(foo.cargo_process("build"), + execs().with_status(0)); + + let index = find_index(); + let path = paths::home().join("tmp"); + let url = Url::from_file_path(&path).unwrap().to_string(); + let repo = git2::Repository::init(&path).unwrap(); + let index = git2::Repository::open(&index).unwrap(); + let mut cfg = repo.config().unwrap(); + cfg.set_str("user.email", "foo@bar.com").unwrap(); + cfg.set_str("user.name", "Foo Bar").unwrap(); + let mut cfg = index.config().unwrap(); + cfg.set_str("user.email", "foo@bar.com").unwrap(); + cfg.set_str("user.name", "Foo Bar").unwrap(); + + for _ in 0..N { + git::commit(&repo); + index.remote_anonymous(&url).unwrap() + .fetch(&["refs/heads/master:refs/remotes/foo/master"], + None, + None).unwrap(); + } + drop((repo, index)); + Package::new("bar", "0.1.1").publish(); + + let before = find_index().join(".git/objects/pack") + .read_dir().unwrap() + .count(); + assert!(before > N); + + let mut cmd = 
foo.cargo("update"); + cmd.env("__CARGO_PACKFILE_LIMIT", "10"); + if let Some(path) = path_env { + cmd.env("PATH", path); + } + cmd.env("RUST_LOG", "trace"); + assert_that(cmd, execs().with_status(0)); + let after = find_index().join(".git/objects/pack") + .read_dir().unwrap() + .count(); + assert!(after < before, + "packfiles before: {}\n\ + packfiles after: {}", before, after); +} + +#[test] +fn use_git_gc() { + if Command::new("git").arg("--version").output().is_err() { + return + } + run_test(None); +} + +#[test] +// it looks like this test passes on some windows machines but not others, +// notably not on AppVeyor's machines. Sounds like another bug, but for another day. +#[cfg_attr(windows, ignore)] +fn avoid_using_git() { + let path = env::var_os("PATH").unwrap_or(Default::default()); + let mut paths = env::split_paths(&path).collect::<Vec<_>>(); + let idx = paths.iter().position(|p| { + p.join("git").exists() || p.join("git.exe").exists() + }); + match idx { + Some(i) => { paths.remove(i); } + None => return, + } + run_test(Some(&env::join_paths(&paths).unwrap()));